EDA: Overview of Suicide Rate in a Particular Year

Author

Michael

Published

March 3, 2023

Modified

March 3, 2023

Step-by-Step Data Preparation

1. Installing and launching required R packages

pacman::p_load("patchwork", "tmap", "ExPanDaR", "kableExtra", "ggstatsplot", "tidyverse")

2. Loading the data

suicidedata_eda_formap <- read_csv("data/suicidedata_eda_formap.csv", show_col_types = FALSE)
suicidedata_eda <- read_csv("data/suicidedata_eda.csv", show_col_types = FALSE)

3. Dataset overview

We will check the suicidedata_eda as suicidedata_eda_formap is meant to be combined with the sf file (world)

3.1 Missing Values

missing.values <- suicidedata_eda %>%
    gather(key = "key", value = "val") %>%
    mutate(is.missing = is.na(val)) %>%
    group_by(key, is.missing) %>%
    summarise(num.missing = n()) %>%
    filter(is.missing==T) %>%
    select(-is.missing) %>%
    arrange(desc(num.missing)) 
missing.values %>% kable()
key num.missing
DN 18360
DR 18360
POP 18360
SN 18360
SP 18360

Visualising Missing Values

missing_values <- function(vari = "age_name"){
prepare_missing_values_graph(suicidedata_eda, ts_id = vari)
}

By country

missing_values("country")

By gender

missing_values("sex_name")

By age group

missing_values("age_name")

By year

missing_values("year")

Conclusion : The missing values are because of age-standardised (only for Suicide Rates)

3.2 Descriptive statistics

descr_stats <- function(year, gender = "Both", age = "All ages") {
  
  descr <- prepare_descriptive_table(suicidedata_eda %>%
                                       filter(year == year,
                                              sex_name == gender,
                                              age_name == age) %>%
                                       select(!c(6,8)) %>%
                                       rename("Number of suicide" = "SN",
                                              "Number of deaths" = "DN",
                                              "Share of deaths from suicide (%)" = "SP",
                                              "Suicide rate" = "SR",
                                              "Mortality rate" = "DR"))
descr$kable_ret  %>%
  kable_styling("condensed", full_width = F, position = "center")
}
descr_stats(2010, "Both", "All ages")
Descriptive Statistics
N Mean Std. dev. Min. 25 % Median 75 % Max.
Number of suicide 6,120 3,874.828 18,425.617 0.126 93.934 532.835 1,882.305 220,356.720
Number of deaths 6,120 252,265.821 920,824.636 10.886 7,358.955 51,487.069 169,570.300 10,659,491.040
Share of deaths from suicide (%) 6,120 1.394 1.067 0.105 0.691 1.118 1.787 11.812
Suicide rate 6,120 11.339 9.395 1.427 5.424 8.224 14.601 95.565
Mortality rate 6,120 847.052 354.531 148.258 611.956 783.986 1,019.318 10,705.808

3.3 Distribution

3.3.1 Histogram of multiple variables
plot_multi_distribution <- function(year, gender = "Both", age = "All ages") {
  
suicidedata_hist <- suicidedata_eda %>% 
  filter(year == year,
         sex_name == gender,
         age_name == age) %>%
  select(c(7,9,10,11,12)) %>%
  rename("Number of suicide" = "SN",
         "Number of deaths" = "DN",
         "Share of deaths from suicide (%)" = "SP",
         "Suicide rate" = "SR",
         "Mortality rate" = "DR")
  
suicidedata_hist %>% 
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap( ~key, ncol=3, scales="free") +
  geom_histogram()
}
plot_multi_distribution(2010, "Both", "All ages")

3.3.2 Histogram of individual variables
plot_distribution <- function(year, gender = "Both", age = "All ages") {

suicidedata_hist <- suicidedata_eda %>% 
  filter(year == year,
         sex_name == gender,
         age_name == age)
  
#computing summary statistics of mean, median and lower and upper whiskers in boxplot
var_mean <- round(mean(suicidedata_hist$SR))
var_median <- round(median(suicidedata_hist$SR))
ymax <- as.numeric(round((IQR(suicidedata_hist$SR)*1.5) +
                quantile(suicidedata_hist$SR,0.75)))
ymin <- as.integer(min(suicidedata_hist$SR))

#plotting histogram
h <- suicidedata_hist %>%
  ggplot(aes(x = SR)) + 
  geom_histogram(color="black", fill="azure4") + 
  scale_x_continuous(labels = scales::comma) +
  labs(x = paste0("Suicide Rate, ",gender,", ",age,", ", year), y = "Count") +
  geom_vline(aes(xintercept = var_mean), col="darkblue", linewidth=1, linetype = "dashed") +
  annotate("text", x=17, y=1750, label="Mean:", 
           size=4, color="darkblue") +
  annotate("text", x=17, y=1650, label=format(var_mean, big.mark = ","),
           size=4, color="darkblue") +
  geom_vline(aes(xintercept = var_median), col="lightpink4", linewidth=1, linetype = "dashed") +
  annotate("text", x=0, y=1750, label="Median:", 
           size=4, color="lightpink4") +
  annotate("text", x=0, y=1650, label=format(var_median, big.mark = ","),
           size=4, color="lightpink4") +
  theme(axis.text.x = element_text(size=8))

#plotting boxplot
b <- suicidedata_hist %>%
  ggplot(aes(y = SR)) + 
  geom_boxplot(outlier.colour="firebrick", outlier.shape=16,
               outlier.size=1, notch=FALSE) + 
  coord_flip() + labs(y = "", x = "") + 
  scale_y_continuous(labels = scales::comma) +
  theme(axis.text = element_blank(), axis.ticks = element_blank()) + 
  stat_boxplot(geom="errorbar", width=0.5) + 
  annotate("text", x=0.35, y=ymax, label=format(ymax, big.mark = ","), 
           size=3, color="lightpink4") +
  annotate("text", x=0.35, y=ymin, label=format(ymin, big.mark = ","), 
           size=3, color="lightpink4")

#combining plots
suicide_distri <- b / h + plot_layout(heights = c(1, 4)) 

suicide_distri
}
plot_distribution(2010, "Both", "All ages")

4. Plotting the choropleth map

4.1 Joining with world map (tmap object)

The object World is a spatial object of class sf from the sf package; it is a data.frame with a special column that contains a geometry for each row, in this case polygons

Reference - https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.html

data("World")

Joining the two dataframes together

suicidedata_eda_map <- left_join(World, 
                                 suicidedata_eda_formap %>% mutate(across(where(is.numeric), round, 2)),
                          by = c("iso_a3" = "code")) %>%
  select(!c(2,3,4,6,7,8,9,10,11,12,13,14,15)) %>%
  mutate(area = as.numeric(str_remove(`area`, 
                                " \\[km\\^2\\]")), 
         .after = region) %>%
  na.omit()

Creating a function to plot

plot_map_eda <- function(year, metric = "SR", gender = "T", style = "jenks"){
  
metric_text = case_when(metric == "SR" ~ "Suicide rate",
                        metric == "SP" ~ "Share of deaths from suicide (%)",
                        metric == "SN" ~ "Number of suicide",
                        metric == "DN" ~ "Number of deaths",
                        metric == "DR" ~ "Mortality rate")

age = case_when(metric == "SR" ~ "AS",
                metric == "SP" ~ "All",
                metric == "SN" ~ "All",
                metric == "DN" ~ "All",
                metric == "DR" ~ "All")

gender_text = case_when(gender == "T" ~ "Total",
                        gender == "M" ~ "Male",
                        gender == "F" ~ "Female")
  
tmap_mode("view")

tm_shape(suicidedata_eda_map |> 
           filter(year == year))+
  tm_fill(paste0(metric,"_",age,"_",gender), 
          style = style, 
          palette="YlOrBr", 
          id = "country",
          title = paste0(metric_text, ", ",gender_text,", ", year),
          popup.vars = c(value = paste0(metric,"_",age,"_",gender))) +
  
  tm_borders(col = "grey20",
             alpha = 0.5) 
}
plot_map_eda(2016, "SR", "M", "jenks")
tmap_mode("plot")